Variable descriptions were obtained from King County, Department of Assessments. All feature engineering should be done in the first code chunks of your document.

# ---- Data import and feature engineering ----
# The training and test files go through exactly the same steps, so the steps
# live in one helper instead of two hand-copied blocks that could drift apart.

# Read one house-sales CSV and apply the shared feature engineering.
#
# path: location of the CSV file.
# Returns a data frame, minus its first column, in which
#   - date is a Date parsed from "%Y%m%d" text,
#   - waterfront is a factor with levels No / Yes,
#   - condition is a factor with levels poor ... very good,
#   - yr_built and yr_renovated are Dates pinned to Sept 1 of the given year,
#     with yr_renovated falling back to yr_built where it was recorded as 0.
read_housedata <- function(path) {
  df <- read.csv(
    path,
    colClasses = c(id = "character", date = "character",
                   yr_built = "character", zipcode = "factor",
                   grade = "factor")
  )
  df$date <- as.Date(df$date, "%Y%m%d")
  # Spell out the levels: with `labels` alone, factor() stops with an
  # "invalid 'labels'" error as soon as a file lacks one of the categories.
  # NOTE(review): assumes waterfront is coded 0/1 and condition 1-5 - confirm
  # against the raw files.
  df$waterfront <- factor(df$waterfront, levels = c(0, 1),
                          labels = c("No", "Yes"))
  df$condition <- factor(df$condition, levels = 1:5,
                         labels = c("poor", "fair", "average", "good",
                                    "very good"))
  # A yr_renovated of 0 is replaced by the build year.
  df$yr_renovated <- ifelse(df$yr_renovated == 0, df$yr_built,
                            df$yr_renovated)
  df$yr_built <- as.Date(ISOdate(df$yr_built, 9, 1))  # Complete Year, Sept 1
  df$yr_renovated <- as.Date(ISOdate(df$yr_renovated, 9, 1))  # Last renovated Year, Sept 1
  # Drop the first column (presumably id - TODO confirm against the CSV header).
  df[, -1]
}

data_dir <- "~/git_repositories/STT3851ClassRepo/Rmarkdown/Data"
housedata <- read_housedata(file.path(data_dir, "housedata.csv"))
#### Perform same steps with test set
housedataT <- read_housedata(file.path(data_dir, "housedataTEST.csv"))
library(DT)
# Interactive table of columns 12 through 20 of the training data.
datatable(data = housedata[, 12:20], rownames = FALSE)

Consider predicting the price (price) of a house based on a certain feature (sqft_living). Start by graphing the relationship.

library(ggplot2)
# Plain scatterplot of price against living area.
p1 <- ggplot(housedata, aes(sqft_living, price)) +
  geom_point() +
  theme_bw()
p1

Overplotting is problematic. What should we do?

0.0.1 Using alpha

# Same scatterplot with nearly transparent blue points, so that dense
# regions show up darker.
p2 <- ggplot(housedata, aes(sqft_living, price)) +
  geom_point(color = "blue", alpha = 0.05) +
  theme_bw()
p2

0.0.2 Using rectangles

# Rectangular 2-d binning with the default fill scale.
p3 <- ggplot(housedata, aes(sqft_living, price)) +
  stat_bin2d(bins = 50) +
  theme_bw()
p3

# Same binning with a light-blue-to-red fill scale limited to counts 0-1000.
p4 <- ggplot(housedata, aes(sqft_living, price)) +
  stat_bin2d(bins = 50) +
  scale_fill_gradient(
    low = "lightblue",
    high = "red",
    limits = c(0, 1000)
  ) +
  theme_bw()
p4

0.0.3 Using hexagons

# Hexagonal binning; the fill scale is limited to counts 0-800 with legend
# breaks every 200.
p5 <- ggplot(housedata, aes(sqft_living, price)) +
  stat_binhex(bins = 50) +
  scale_fill_gradient(
    low = "lightblue",
    high = "red",
    limits = c(0, 800),
    breaks = seq(0, 800, by = 200)
  ) +
  theme_bw()
p5

**Note:** For both stat_bin2d and stat_binhex, if you manually specify the range of the fill scale and a bin falls outside that range because it has too many or too few points, that bin will show up as grey rather than as the color at the high or low end of the range. Observe the grey hexagons in the lower left corner of the above graph.

# Hexagonal binning again, with the fill limits widened to 0-1000.
p6 <- ggplot(housedata, aes(sqft_living, price)) +
  stat_binhex(bins = 50) +
  scale_fill_gradient(
    low = "lightblue",
    high = "red",
    limits = c(0, 1000),
    breaks = seq(0, 1000, by = 200)
  ) +
  theme_bw()
p6

library(car)  # red line affected by outlier, green ignoring the outlier (robust)

# Pairwise scatterplots; the trailing note on each call records the author's
# reading of the plot.
scatterplot(x = housedata$price, y = housedata$bedrooms)  # not much dependent

scatterplot(x = housedata$bedrooms, y = housedata$bathrooms)  # dependent, excluding the 33 outlier

scatterplot(x = housedata$price, y = housedata$bathrooms)  # dependent

scatterplot(x = housedata$price, y = housedata$sqft_lot)  # not dependent

scatterplot(x = housedata$price, y = housedata$view)  # dependent, but view is mostly 0, so not dependent

scatterplot(x = housedata$price, y = housedata$grade)  # dependent

scatterplot(x = housedata$price, y = housedata$floors)  # not dependent, from boxplot

scatterplot(x = housedata$price, y = housedata$condition)  # nearly not dependent

scatterplot(x = housedata$price, y = housedata$waterfront)  # not dependent, as nearly no waterfront

scatterplot(x = housedata$price, y = housedata$bedrooms)  # not much dependent

scatterplot(x = housedata$price, y = housedata$sqft_above)  # dependent

scatterplot(x = housedata$price, y = housedata$sqft_basement)  # dependent

# scatterplot(x = housedata$price, y = housedata$age)  # low negative dependent
scatterplot(x = housedata$price, y = housedata$zipcode)

# scatterplot(x = housedata$price, y = housedata$renage)  # not dependent, as very few houses are renovated
# Scatterplot with a fitted least-squares line, using ggplot
ggplot(housedata, aes(sqft_living, price)) +
  geom_point(colour = "skyblue") +
  geom_smooth(method = "lm")

# Numeric columns for the correlation analysis, selected by name so the
# subset stays correct if the column order of housedata ever changes.
# Kept: price, the room counts, the square-footage measures, floors, view,
# lat and long. Left out: date, waterfront, condition, grade, yr_built,
# yr_renovated and zipcode (Date or factor columns, which cor() rejects).
housedata1 <- housedata[, c("price", "bedrooms", "bathrooms", "sqft_living",
                            "sqft_lot", "floors", "view", "sqft_above",
                            "sqft_basement", "lat", "long", "sqft_living15",
                            "sqft_lot15")]

# Correlation matrix
cor(housedata1)
                   price    bedrooms  bathrooms sqft_living     sqft_lot
price         1.00000000  0.31284286 0.52334477  0.70291635  0.088238107
bedrooms      0.31284286  1.00000000 0.52923162  0.59105983  0.030179053
bathrooms     0.52334477  0.52923162 1.00000000  0.75455302  0.082139581
sqft_living   0.70291635  0.59105983 0.75455302  1.00000000  0.166967283
sqft_lot      0.08823811  0.03017905 0.08213958  0.16696728  1.000000000
floors        0.25235756  0.18028523 0.50066694  0.35267511 -0.002951851
view          0.39102268  0.07884375 0.18312596  0.27981310  0.069978368
sqft_above    0.60527752  0.49174312 0.68455295  0.87631944  0.176005462
sqft_basement 0.33122956  0.31056084 0.29077234  0.44288611  0.018691884
lat           0.30948443 -0.01002422 0.02676418  0.05693083 -0.085417697
long          0.02131272  0.13604729 0.22151426  0.23737409  0.225347502
sqft_living15 0.58348082  0.40306677 0.56816564  0.75627424  0.147707827
sqft_lot15    0.08080643  0.02784234 0.08467962  0.17830644  0.727774079
                    floors         view   sqft_above sqft_basement
price          0.252357558  0.391022681  0.605277522    0.33122956
bedrooms       0.180285231  0.078843754  0.491743119    0.31056084
bathrooms      0.500666944  0.183125959  0.684552945    0.29077234
sqft_living    0.352675112  0.279813103  0.876319445    0.44288611
sqft_lot      -0.002951851  0.069978368  0.176005462    0.01869188
floors         1.000000000  0.026258735  0.522710921   -0.24145116
view           0.026258735  1.000000000  0.163954243    0.27514730
sqft_above     0.522710921  0.163954243  1.000000000   -0.04379916
sqft_basement -0.241451164  0.275147303 -0.043799158    1.00000000
lat            0.049952734  0.008885553  0.001422037    0.11541978
long           0.125918561 -0.076033506  0.341128260   -0.14261452
sqft_living15  0.280417017  0.279937620  0.732554007    0.20500421
sqft_lot15    -0.007389463  0.068809179  0.188503973    0.01894595
                       lat        long sqft_living15   sqft_lot15
price          0.309484427  0.02131272    0.58348082  0.080806426
bedrooms      -0.010024220  0.13604729    0.40306677  0.027842339
bathrooms      0.026764178  0.22151426    0.56816564  0.084679619
sqft_living    0.056930825  0.23737409    0.75627424  0.178306444
sqft_lot      -0.085417697  0.22534750    0.14770783  0.727774079
floors         0.049952734  0.12591856    0.28041702 -0.007389463
view           0.008885553 -0.07603351    0.27993762  0.068809179
sqft_above     0.001422037  0.34112826    0.73255401  0.188503973
sqft_basement  0.115419784 -0.14261452    0.20500421  0.018945951
lat            1.000000000 -0.13331116    0.04993821 -0.089826111
long          -0.133311159  1.00000000    0.33399885  0.253890095
sqft_living15  0.049938206  0.33399885    1.00000000  0.184561578
sqft_lot15    -0.089826111  0.25389009    0.18456158  1.000000000
# Graphical display of the correlation matrix of housedata1
library(corrplot)
corrplot(corr = cor(housedata1))

# Converting numeric variables to categorical ones (bedrooms, bathrooms, grade, zipcode).
# Left disabled; grade and zipcode are already read in as factors through the
# colClasses argument of read.csv().
#housedata$bedrooms <- as.factor(housedata$bedrooms)
#housedata$bathrooms <- as.factor(housedata$bathrooms)
#housedata$grade <- as.factor(housedata$grade)
#housedata$zipcode <- as.factor(housedata$zipcode)

# Check the resulting column types
str(housedata)
'data.frame':   17384 obs. of  20 variables:
 $ date         : Date, format: "2014-10-13" "2014-12-09" ...
 $ price        : num  221900 538000 180000 604000 510000 ...
 $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
 $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
 $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
 $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
 $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
 $ waterfront   : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ condition    : Factor w/ 5 levels "poor","fair",..: 3 3 3 5 3 3 3 3 3 3 ...
 $ grade        : Factor w/ 12 levels "1","10","11",..: 10 10 9 10 11 3 10 10 10 10 ...
 $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
 $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
 $ yr_built     : Date, format: "1955-09-01" "1951-09-01" ...
 $ yr_renovated : Date, format: "1955-09-01" "1991-09-01" ...
 $ zipcode      : Factor w/ 70 levels "98001","98002",..: 67 56 17 59 38 30 3 69 61 24 ...
 $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
 $ long         : num  -122 -122 -122 -122 -122 ...
 $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
 $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
# Multiple regression of price on living area, bedrooms, bathrooms, grade
# and above-ground area.
model1 <- lm(
  price ~ sqft_living + bedrooms + bathrooms + grade + sqft_above,
  data = housedata
)
summary(model1)

Call:
lm(formula = price ~ sqft_living + bedrooms + bathrooms + grade + 
    sqft_above, data = housedata)

Residuals:
     Min       1Q   Median       3Q      Max 
-1628606  -123162   -25774    90436  4581884 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.987e+04  2.348e+05   0.425 0.670595    
sqft_living  2.427e+02  4.932e+00  49.205  < 2e-16 ***
bedrooms    -2.629e+04  2.576e+03 -10.205  < 2e-16 ***
bathrooms   -3.834e+03  3.745e+03  -1.024 0.306051    
grade10      5.383e+05  2.352e+05   2.288 0.022132 *  
grade11      8.410e+05  2.356e+05   3.569 0.000359 ***
grade12      1.389e+06  2.369e+05   5.863 4.62e-09 ***
grade13      2.289e+06  2.454e+05   9.327  < 2e-16 ***
grade3       4.637e+04  2.711e+05   0.171 0.864216    
grade4       5.160e+04  2.416e+05   0.214 0.830906    
grade5       7.285e+04  2.355e+05   0.309 0.757049    
grade6       9.235e+04  2.349e+05   0.393 0.694289    
grade7       1.228e+05  2.349e+05   0.523 0.601199    
grade8       1.915e+05  2.350e+05   0.815 0.415073    
grade9       3.354e+05  2.351e+05   1.427 0.153706    
sqft_above  -9.740e+01  4.719e+00 -20.641  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 234800 on 17368 degrees of freedom
Multiple R-squared:  0.597, Adjusted R-squared:  0.5966 
F-statistic:  1715 on 15 and 17368 DF,  p-value: < 2.2e-16
# Sequential ANOVA table: each term is assessed after the terms listed before
# it in the model formula.
anova(model1)
Analysis of Variance Table

Response: price
               Df     Sum Sq    Mean Sq    F value  Pr(>F)    
sqft_living     1 1.1738e+15 1.1738e+15 21292.8959 < 2e-16 ***
bedrooms        1 3.8454e+13 3.8454e+13   697.5391 < 2e-16 ***
bathrooms       1 2.0958e+11 2.0958e+11     3.8017 0.05122 .  
grade          11 1.8230e+14 1.6572e+13   300.6138 < 2e-16 ***
sqft_above      1 2.3488e+13 2.3488e+13   426.0662 < 2e-16 ***
Residuals   17368 9.5747e+14 5.5128e+10                       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same predictors as model1, plus zipcode as a factor.
model2 <- lm(
  price ~ sqft_living + bedrooms + bathrooms + grade + sqft_above + zipcode,
  data = housedata
)
summary(model2)

Call:
lm(formula = price ~ sqft_living + bedrooms + bathrooms + grade + 
    sqft_above + zipcode, data = housedata)

Residuals:
     Min       1Q   Median       3Q      Max 
-1553993   -71559    -3707    56624  4046688 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -8.408e+04  1.741e+05  -0.483 0.629206    
sqft_living   1.881e+02  3.730e+00  50.419  < 2e-16 ***
bedrooms     -2.241e+04  1.932e+03 -11.599  < 2e-16 ***
bathrooms     6.484e+03  2.801e+03   2.315 0.020651 *  
grade10       2.927e+05  1.742e+05   1.680 0.092986 .  
grade11       5.133e+05  1.745e+05   2.942 0.003269 ** 
grade12       1.000e+06  1.755e+05   5.699 1.22e-08 ***
grade13       1.802e+06  1.818e+05   9.913  < 2e-16 ***
grade3        1.342e+05  2.006e+05   0.669 0.503654    
grade4        7.677e+04  1.787e+05   0.430 0.667555    
grade5        8.142e+04  1.744e+05   0.467 0.640510    
grade6        5.785e+04  1.740e+05   0.333 0.739460    
grade7        5.220e+04  1.740e+05   0.300 0.764118    
grade8        6.994e+04  1.740e+05   0.402 0.687712    
grade9        1.512e+05  1.741e+05   0.868 0.385212    
sqft_above    8.337e-01  3.764e+00   0.222 0.824702    
zipcode98002  1.932e+04  1.710e+04   1.130 0.258670    
zipcode98003  5.637e+03  1.526e+04   0.370 0.711749    
zipcode98004  7.750e+05  1.493e+04  51.892  < 2e-16 ***
zipcode98005  3.028e+05  1.826e+04  16.581  < 2e-16 ***
zipcode98006  2.677e+05  1.347e+04  19.874  < 2e-16 ***
zipcode98007  2.551e+05  1.895e+04  13.463  < 2e-16 ***
zipcode98008  3.186e+05  1.524e+04  20.905  < 2e-16 ***
zipcode98010  6.401e+04  2.162e+04   2.961 0.003074 ** 
zipcode98011  1.294e+05  1.738e+04   7.448 9.93e-14 ***
zipcode98014  8.334e+04  2.030e+04   4.106 4.05e-05 ***
zipcode98019  8.429e+04  1.750e+04   4.816 1.48e-06 ***
zipcode98022  4.516e+04  1.645e+04   2.745 0.006062 ** 
zipcode98023 -2.341e+04  1.333e+04  -1.757 0.078989 .  
zipcode98024  1.713e+05  2.316e+04   7.397 1.46e-13 ***
zipcode98027  1.576e+05  1.414e+04  11.148  < 2e-16 ***
zipcode98028  1.316e+05  1.541e+04   8.539  < 2e-16 ***
zipcode98029  2.097e+05  1.493e+04  14.050  < 2e-16 ***
zipcode98030  1.881e+03  1.550e+04   0.121 0.903393    
zipcode98031  2.128e+04  1.532e+04   1.389 0.164933    
zipcode98032  9.784e+03  1.997e+04   0.490 0.624196    
zipcode98033  3.721e+05  1.382e+04  26.927  < 2e-16 ***
zipcode98034  2.155e+05  1.303e+04  16.536  < 2e-16 ***
zipcode98038  2.857e+04  1.283e+04   2.228 0.025895 *  
zipcode98039  1.178e+06  2.815e+04  41.857  < 2e-16 ***
zipcode98040  5.742e+05  1.593e+04  36.057  < 2e-16 ***
zipcode98042  1.118e+04  1.312e+04   0.852 0.394156    
zipcode98045  1.007e+05  1.666e+04   6.043 1.55e-09 ***
zipcode98052  2.394e+05  1.301e+04  18.402  < 2e-16 ***
zipcode98053  1.905e+05  1.405e+04  13.561  < 2e-16 ***
zipcode98055  4.202e+04  1.554e+04   2.704 0.006867 ** 
zipcode98056  1.073e+05  1.389e+04   7.725 1.18e-14 ***
zipcode98058  3.893e+04  1.352e+04   2.880 0.003985 ** 
zipcode98059  7.654e+04  1.358e+04   5.636 1.76e-08 ***
zipcode98065  8.202e+04  1.473e+04   5.567 2.63e-08 ***
zipcode98070  1.925e+05  2.054e+04   9.373  < 2e-16 ***
zipcode98072  1.511e+05  1.538e+04   9.826  < 2e-16 ***
zipcode98074  1.710e+05  1.375e+04  12.437  < 2e-16 ***
zipcode98075  1.808e+05  1.443e+04  12.530  < 2e-16 ***
zipcode98077  9.797e+04  1.736e+04   5.644 1.68e-08 ***
zipcode98092 -2.236e+04  1.418e+04  -1.577 0.114805    
zipcode98102  5.005e+05  2.248e+04  22.270  < 2e-16 ***
zipcode98103  3.386e+05  1.294e+04  26.158  < 2e-16 ***
zipcode98105  5.094e+05  1.635e+04  31.154  < 2e-16 ***
zipcode98106  1.165e+05  1.479e+04   7.877 3.56e-15 ***
zipcode98107  3.559e+05  1.554e+04  22.900  < 2e-16 ***
zipcode98108  1.175e+05  1.784e+04   6.583 4.75e-11 ***
zipcode98109  5.204e+05  2.097e+04  24.813  < 2e-16 ***
zipcode98112  6.090e+05  1.563e+04  38.957  < 2e-16 ***
zipcode98115  3.472e+05  1.291e+04  26.901  < 2e-16 ***
zipcode98116  3.235e+05  1.475e+04  21.934  < 2e-16 ***
zipcode98117  3.288e+05  1.304e+04  25.211  < 2e-16 ***
zipcode98118  1.765e+05  1.328e+04  13.294  < 2e-16 ***
zipcode98119  5.177e+05  1.741e+04  29.739  < 2e-16 ***
zipcode98122  3.468e+05  1.538e+04  22.548  < 2e-16 ***
zipcode98125  2.281e+05  1.398e+04  16.323  < 2e-16 ***
zipcode98126  2.075e+05  1.475e+04  14.062  < 2e-16 ***
zipcode98133  1.655e+05  1.333e+04  12.417  < 2e-16 ***
zipcode98136  2.929e+05  1.557e+04  18.810  < 2e-16 ***
zipcode98144  3.015e+05  1.454e+04  20.740  < 2e-16 ***
zipcode98146  1.368e+05  1.523e+04   8.982  < 2e-16 ***
zipcode98148  7.603e+04  2.850e+04   2.667 0.007649 ** 
zipcode98155  1.664e+05  1.381e+04  12.051  < 2e-16 ***
zipcode98166  1.200e+05  1.592e+04   7.540 4.93e-14 ***
zipcode98168  5.026e+04  1.573e+04   3.194 0.001405 ** 
zipcode98177  2.663e+05  1.573e+04  16.926  < 2e-16 ***
zipcode98178  7.320e+04  1.548e+04   4.730 2.26e-06 ***
zipcode98188  3.155e+04  1.909e+04   1.653 0.098385 .  
zipcode98198  5.332e+04  1.527e+04   3.492 0.000481 ***
zipcode98199  3.891e+05  1.490e+04  26.106  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 172600 on 17299 degrees of freedom
Multiple R-squared:  0.7831,    Adjusted R-squared:  0.782 
F-statistic: 743.5 on 84 and 17299 DF,  p-value: < 2.2e-16
# Sequential ANOVA table for model2; zipcode enters last, after all other terms.
anova(model2)
Analysis of Variance Table

Response: price
               Df     Sum Sq    Mean Sq    F value    Pr(>F)    
sqft_living     1 1.1738e+15 1.1738e+15 39406.9608 < 2.2e-16 ***
bedrooms        1 3.8454e+13 3.8454e+13  1290.9421 < 2.2e-16 ***
bathrooms       1 2.0958e+11 2.0958e+11     7.0358  0.007997 ** 
grade          11 1.8230e+14 1.6572e+13   556.3487 < 2.2e-16 ***
sqft_above      1 2.3488e+13 2.3488e+13   788.5248 < 2.2e-16 ***
zipcode        69 4.4217e+14 6.4083e+12   215.1322 < 2.2e-16 ***
Residuals   17299 5.1530e+14 2.9788e+10                         
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(randomForest)
# A random forest is fitted from random bootstrap samples and random split
# candidates, so fix the RNG seed to get the same model on every knit.
set.seed(3851)
model3 <- randomForest(price ~ bathrooms + sqft_above, data = housedata)

0.1 What features might be visible in a scatterplot?

Use a simple linear model to predict the price of a house with 2,500 \(\text{ft}^2\) of living space.

# Simple linear regression of price on living area alone.
slm <- lm(formula = price ~ sqft_living, data = housedata)
summary(slm)

Call:
lm(formula = price ~ sqft_living, data = housedata)

Residuals:
     Min       1Q   Median       3Q      Max 
-1490607  -148265   -23758   105710  4349512 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -47116.079   4923.344   -9.57   <2e-16 ***
sqft_living    281.959      2.164  130.29   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 263000 on 17382 degrees of freedom
Multiple R-squared:  0.4941,    Adjusted R-squared:  0.4941 
F-statistic: 1.698e+04 on 1 and 17382 DF,  p-value: < 2.2e-16
# Point prediction from slm for sqft_living = 2500.
predict(slm, newdata = data.frame(sqft_living = 2500))
     1 
657781 
# Overlay the least-squares line on the hexbin plot and mark the prediction
# at 2500 square feet with dashed red reference lines.
p6 +
  geom_smooth(method = "lm") +
  geom_vline(xintercept = 2500, linetype = "dashed", color = "red") +
  geom_hline(
    yintercept = predict(slm, newdata = data.frame(sqft_living = 2500)),
    linetype = "dashed",
    color = "red"
  ) +
  labs(x = "Living Space (square feet)", y = "Price ($)")

mod.zip is the most basic model: it contains only an intercept.

# Intercept-only model: every house is predicted at the mean training price.
mod.zip <- lm(formula = price ~ 1, data = housedata)
summary(mod.zip)

Call:
lm(formula = price ~ 1, data = housedata)

Residuals:
    Min      1Q  Median      3Q     Max 
-464367 -219367  -89367  100633 7160633 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   539367       2804   192.4   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 369700 on 17383 degrees of freedom

mod.all uses all current features except sqft_basement.

# Regress price on every other column of housedata except sqft_basement.
mod.all <- lm(formula = price ~ . - sqft_basement, data = housedata)
summary(mod.all)

Call:
lm(formula = price ~ . - sqft_basement, data = housedata)

Residuals:
     Min       1Q   Median       3Q      Max 
-1539648   -60120     2700    56110  3478116 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -4.135e+07  6.410e+06  -6.450 1.15e-10 ***
date                1.145e+02  1.016e+01  11.269  < 2e-16 ***
bedrooms           -1.431e+04  1.708e+03  -8.377  < 2e-16 ***
bathrooms           2.394e+04  2.764e+03   8.663  < 2e-16 ***
sqft_living         1.170e+02  3.714e+00  31.495  < 2e-16 ***
sqft_lot            2.574e-01  4.059e-02   6.342 2.32e-10 ***
floors             -2.963e+04  3.344e+03  -8.862  < 2e-16 ***
waterfrontYes       6.641e+05  1.465e+04  45.331  < 2e-16 ***
view                4.953e+04  1.828e+03  27.102  < 2e-16 ***
conditionfair       9.809e+04  3.414e+04   2.873 0.004075 ** 
conditionaverage    1.042e+05  3.174e+04   3.282 0.001032 ** 
conditiongood       1.318e+05  3.175e+04   4.151 3.32e-05 ***
conditionvery good  1.814e+05  3.194e+04   5.677 1.39e-08 ***
grade10             1.173e+05  1.549e+05   0.757 0.448960    
grade11             3.143e+05  1.552e+05   2.025 0.042869 *  
grade12             7.525e+05  1.561e+05   4.821 1.44e-06 ***
grade13             1.661e+06  1.615e+05  10.287  < 2e-16 ***
grade3             -7.935e+02  1.776e+05  -0.004 0.996434    
grade4             -9.356e+04  1.588e+05  -0.589 0.555737    
grade5             -1.035e+05  1.548e+05  -0.668 0.503898    
grade6             -1.122e+05  1.547e+05  -0.725 0.468200    
grade7             -1.091e+05  1.547e+05  -0.705 0.480711    
grade8             -8.819e+04  1.548e+05  -0.570 0.568766    
grade9             -1.235e+04  1.548e+05  -0.080 0.936432    
sqft_above          5.176e+01  3.858e+00  13.416  < 2e-16 ***
yr_built           -3.312e+00  2.798e-01 -11.838  < 2e-16 ***
yr_renovated        2.591e+00  2.847e-01   9.103  < 2e-16 ***
zipcode98002        1.909e+04  1.516e+04   1.259 0.207972    
zipcode98003       -1.207e+04  1.341e+04  -0.900 0.368288    
zipcode98004        7.176e+05  2.455e+04  29.229  < 2e-16 ***
zipcode98005        2.534e+05  2.626e+04   9.650  < 2e-16 ***
zipcode98006        2.205e+05  2.140e+04  10.305  < 2e-16 ***
zipcode98007        2.136e+05  2.691e+04   7.938 2.18e-15 ***
zipcode98008        2.371e+05  2.569e+04   9.230  < 2e-16 ***
zipcode98010        1.108e+05  2.280e+04   4.860 1.18e-06 ***
zipcode98011        5.783e+04  3.358e+04   1.722 0.085099 .  
zipcode98014        1.005e+05  3.709e+04   2.711 0.006722 ** 
zipcode98019        7.545e+04  3.613e+04   2.088 0.036804 *  
zipcode98022        6.976e+04  2.010e+04   3.470 0.000521 ***
zipcode98023       -5.426e+04  1.240e+04  -4.376 1.21e-05 ***
zipcode98024        1.806e+05  3.184e+04   5.672 1.43e-08 ***
zipcode98027        1.737e+05  2.203e+04   7.885 3.33e-15 ***
zipcode98028        3.778e+04  3.257e+04   1.160 0.246053    
zipcode98029        2.355e+05  2.510e+04   9.383  < 2e-16 ***
zipcode98030        1.126e+04  1.465e+04   0.769 0.441829    
zipcode98031        1.564e+04  1.537e+04   1.018 0.308915    
zipcode98032       -1.284e+04  1.792e+04  -0.717 0.473687    
zipcode98033        2.958e+05  2.789e+04  10.608  < 2e-16 ***
zipcode98034        1.264e+05  2.990e+04   4.228 2.37e-05 ***
zipcode98038        7.791e+04  1.655e+04   4.708 2.53e-06 ***
zipcode98039        1.101e+06  3.245e+04  33.942  < 2e-16 ***
zipcode98040        4.680e+05  2.194e+04  21.327  < 2e-16 ***
zipcode98042        2.810e+04  1.418e+04   1.982 0.047456 *  
zipcode98045        1.814e+05  3.066e+04   5.917 3.33e-09 ***
zipcode98052        1.967e+05  2.842e+04   6.922 4.62e-12 ***
zipcode98053        1.808e+05  3.037e+04   5.954 2.67e-09 ***
zipcode98055        2.146e+04  1.718e+04   1.249 0.211728    
zipcode98056        6.367e+04  1.868e+04   3.409 0.000653 ***
zipcode98058        3.359e+04  1.620e+04   2.073 0.038186 *  
zipcode98059        6.952e+04  1.837e+04   3.784 0.000155 ***
zipcode98065        1.346e+05  2.815e+04   4.782 1.75e-06 ***
zipcode98070       -7.192e+04  2.176e+04  -3.305 0.000950 ***
zipcode98072        9.011e+04  3.321e+04   2.713 0.006671 ** 
zipcode98074        1.588e+05  2.686e+04   5.912 3.45e-09 ***
zipcode98075        1.598e+05  2.576e+04   6.204 5.64e-10 ***
zipcode98077        6.050e+04  3.468e+04   1.745 0.081073 .  
zipcode98092       -1.196e+03  1.326e+04  -0.090 0.928155    
zipcode98102        4.390e+05  2.935e+04  14.959  < 2e-16 ***
zipcode98103        2.424e+05  2.709e+04   8.948  < 2e-16 ***
zipcode98105        3.997e+05  2.773e+04  14.414  < 2e-16 ***
zipcode98106        5.587e+04  2.006e+04   2.785 0.005359 ** 
zipcode98107        2.487e+05  2.780e+04   8.948  < 2e-16 ***
zipcode98108        6.070e+04  2.237e+04   2.714 0.006652 ** 
zipcode98109        4.148e+05  2.861e+04  14.498  < 2e-16 ***
zipcode98112        5.380e+05  2.544e+04  21.151  < 2e-16 ***
zipcode98115        2.480e+05  2.744e+04   9.040  < 2e-16 ***
zipcode98116        2.080e+05  2.232e+04   9.320  < 2e-16 ***
zipcode98117        2.180e+05  2.782e+04   7.834 4.99e-15 ***
zipcode98118        1.124e+05  1.948e+04   5.771 8.02e-09 ***
zipcode98119        4.068e+05  2.701e+04  15.061  < 2e-16 ***
zipcode98122        2.721e+05  2.421e+04  11.235  < 2e-16 ***
zipcode98125        1.097e+05  2.968e+04   3.695 0.000220 ***
zipcode98126        1.206e+05  2.068e+04   5.831 5.61e-09 ***
zipcode98133        5.335e+04  3.061e+04   1.743 0.081338 .  
zipcode98136        1.768e+05  2.100e+04   8.420  < 2e-16 ***
zipcode98144        2.204e+05  2.244e+04   9.823  < 2e-16 ***
zipcode98146        3.043e+04  1.876e+04   1.622 0.104828    
zipcode98148        3.650e+04  2.635e+04   1.385 0.165989    
zipcode98155        4.353e+04  3.191e+04   1.364 0.172629    
zipcode98166        5.831e+03  1.725e+04   0.338 0.735412    
zipcode98168        9.246e+03  1.825e+04   0.507 0.612501    
zipcode98177        1.079e+05  3.190e+04   3.382 0.000720 ***
zipcode98178       -5.908e+03  1.861e+04  -0.317 0.750888    
zipcode98188       -4.184e+03  1.898e+04  -0.220 0.825497    
zipcode98198       -3.087e+04  1.448e+04  -2.132 0.033009 *  
zipcode98199        2.724e+05  2.639e+04  10.323  < 2e-16 ***
lat                 2.233e+05  6.632e+04   3.367 0.000761 ***
long               -2.361e+05  4.727e+04  -4.995 5.95e-07 ***
sqft_living15       1.521e+01  3.029e+00   5.023 5.15e-07 ***
sqft_lot15         -1.159e-01  6.429e-02  -1.803 0.071344 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 150200 on 17284 degrees of freedom
Multiple R-squared:  0.8359,    Adjusted R-squared:  0.835 
F-statistic: 889.4 on 99 and 17284 DF,  p-value: < 2.2e-16
# Sequential ANOVA table for mod.all; terms enter in the column order of housedata.
anova(mod.all)
Analysis of Variance Table

Response: price
                 Df     Sum Sq    Mean Sq    F value    Pr(>F)    
date              1 2.9956e+10 2.9956e+10     1.3282 0.2491485    
bedrooms          1 2.3249e+14 2.3249e+14 10307.9306 < 2.2e-16 ***
bathrooms         1 4.2285e+14 4.2285e+14 18748.2537 < 2.2e-16 ***
sqft_living       1 5.5816e+14 5.5816e+14 24747.5215 < 2.2e-16 ***
sqft_lot          1 3.8694e+12 3.8694e+12   171.5587 < 2.2e-16 ***
floors            1 9.0483e+10 9.0483e+10     4.0118 0.0451986 *  
waterfront        1 8.3044e+13 8.3044e+13  3681.9764 < 2.2e-16 ***
view              1 3.8892e+13 3.8892e+13  1724.3793 < 2.2e-16 ***
condition         4 1.6341e+13 4.0854e+12   181.1348 < 2.2e-16 ***
grade            11 1.7804e+14 1.6185e+13   717.6222 < 2.2e-16 ***
sqft_above        1 7.0222e+12 7.0222e+12   311.3450 < 2.2e-16 ***
yr_built          1 8.4946e+13 8.4946e+13  3766.2921 < 2.2e-16 ***
yr_renovated      1 4.7609e+11 4.7609e+11    21.1086 4.370e-06 ***
zipcode          69 3.5814e+14 5.1905e+12   230.1340 < 2.2e-16 ***
lat               1 3.1253e+11 3.1253e+11    13.8567 0.0001979 ***
long              1 6.0257e+11 6.0257e+11    26.7164 2.382e-07 ***
sqft_living15     1 5.4648e+11 5.4648e+11    24.2294 8.630e-07 ***
sqft_lot15        1 7.3351e+10 7.3351e+10     3.2522 0.0713445 .  
Residuals     17284 3.8983e+14 2.2554e+10                         
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

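Your goal is to create a model with as small a test error as possible. Note that the residual standard error of model mod.all on the training data, that is, the square root of the training RSS divided by its 17284 residual degrees of freedom, is \(1.5018076\times 10^{5}\).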

library(ggmap)
# Base map centred on lon -122.1, lat 47.48.
# NOTE(review): recent ggmap releases need a Google API key registered with
# register_google() before source = "google" works - confirm for your setup.
KingMap <-
  get_map(
    location = c(lon = -122.1, lat = 47.48),
    zoom = 10,
    source = "google",
    maptype = "roadmap"
  )
# Training houses in blue, test houses in red. Inside aes() the columns are
# referenced by bare name so that each layer reads them from its own `data`
# argument rather than from a global data frame.
ggmap(KingMap) +
  geom_point(
    aes(x = long, y = lat),
    data = housedata,
    alpha = .2,
    color = "blue",
    size = 0.01
  ) +
  geom_point(
    aes(x = long, y = lat),
    data = housedataT,
    alpha = .2,
    color = "red",
    size = 0.01
  ) +
  ggtitle("Houses Sold in King County, Wa (2014-2015)") +
  labs(x = "longitude", y = "latitude")

0.2 Prediction

# Predict prices for the test set with mod.all and preview the first six.
Yourlastname_Yourfirstname <- predict(object = mod.all, newdata = housedataT)
head(Yourlastname_Yourfirstname)
       1        2        3        4        5        6 
310196.9 845551.2 305946.0 532806.5 485256.6 469122.8 
# Write the predictions to a CSV file (relative path, so it lands in the
# current working directory).
write.csv(Yourlastname_Yourfirstname, file = "Yourlastname_Yourfirstname.csv")

I will compute your \(\sqrt{MSPE}\).

# Root mean squared prediction error over the first six predictions,
# compared with the six prices listed here.
SMSPE <- sqrt(mean(
  (head(Yourlastname_Yourfirstname) -
     c(310000, 650000, 233000, 580500, 535000, 605000))^2
))
SMSPE
[1] 105493.4